Code
datatable(a_names, filter = "top")
1. Summary Table
# Relabel the Gender column as Sex; the later chunks refer to it as Sex.
a_names <- rename(a_names, Sex = Gender)
# Total number of babies named "Allison" per state, split by sex, in wide
# form: one row per State with one count column per Sex value.
allison_names <- a_names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  # Collapse straight to one total per State/Sex pair. `.groups = "drop"`
  # returns an ungrouped tibble, so no grouped-output message is emitted.
  summarize(Total = sum(Count), .groups = "drop") |>
  pivot_wider(names_from = Sex, values_from = Total) |>
  # A State/Sex pair with no rows comes out of the pivot as NA; show it as 0.
  # https://tidyr.tidyverse.org/reference/replace_na.html
  replace_na(list(M = 0, `F` = 0))
# Render the per-state Allison counts as a left-aligned, striped table.
allison_names |>
  # Pin the column order so the positional labels below cannot be mismatched.
  select(State, `F`, M) |>
  kable(
    col.names = c(
      "State",
      "Number of Females Named Allison",
      "Number of Males Named Allison"
    ),
    caption = "Summarizing Allison Names By State and Sex"
  ) |>
  kable_minimal() |>
  kable_styling(
    bootstrap_options = "striped",
    position = "left",
    # Spelled out: `F` is an ordinary variable that can be reassigned.
    full_width = FALSE,
    font_size = 15
  )
| State | Number of Females Named Allison | Number of Males Named Allison |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
2. Filtering for only Female
# Female-only view: keep each state and its count of girls named Allison.
kable(
  select(allison_names, State, `F`),
  col.names = c("State", "Number of Females with Allison Names")
)
| State | Number of Females with Allison Names |
|---|---|
| AK | 232 |
| AL | 1535 |
| AR | 1198 |
| AZ | 1880 |
| CA | 12413 |
| CO | 1594 |
| CT | 1099 |
| DC | 321 |
| DE | 294 |
| FL | 4455 |
| GA | 3257 |
| HI | 183 |
| IA | 1477 |
| ID | 451 |
| IL | 5110 |
| IN | 3067 |
| KS | 1283 |
| KY | 1905 |
| LA | 1209 |
| MA | 2218 |
| MD | 2229 |
| ME | 340 |
| MI | 4014 |
| MN | 2374 |
| MO | 2882 |
| MS | 817 |
| MT | 226 |
| NC | 3435 |
| ND | 285 |
| NE | 807 |
| NH | 412 |
| NJ | 3052 |
| NM | 399 |
| NV | 729 |
| NY | 5747 |
| OH | 5487 |
| OK | 1421 |
| OR | 1186 |
| PA | 4307 |
| RI | 306 |
| SC | 1228 |
| SD | 376 |
| TN | 2488 |
| TX | 10192 |
| UT | 1125 |
| VA | 3220 |
| VT | 135 |
| WA | 1956 |
| WI | 2367 |
| WV | 813 |
| WY | 142 |
# select() was used because there is one row per state, so the female counts are a column rather than a subset of rows.
3. Time Series Plot
# Number of babies named "Allison" in each year, summed over every matching
# row for that year.
allison_years <- a_names |>
  filter(Name == "Allison") |>
  group_by(Year) |>
  # One summed row per year; `.groups = "drop"` leaves the result ungrouped.
  summarize(Total = sum(Count), .groups = "drop")
# Line chart of the yearly Allison totals.
ggplot(allison_years, aes(x = Year, y = Total)) +
  geom_line(color = "Blue", size = 1.25) +
  labs(title = "Popularity of Allison Names over the Years") +
  # The y-axis title is removed; the plot title is aligned to the whole plot.
  theme(
    plot.title.position = "plot",
    axis.title.y = element_blank()
  )
4. Creating a Linear Model
# Fit a simple linear regression of the yearly total on the year.
lm(Total ~ Year, data = allison_years)
Call:
lm(formula = Total ~ Year, data = allison_years)
Coefficients:
(Intercept) Year
209815.1 -101.6
5. Visualizing Regression Model
# Scatter plot of the yearly totals with a fitted linear-model smoother.
ggplot(allison_years, aes(x = Year, y = Total)) +
  geom_point() +
  stat_smooth(method = "lm") +
  labs(title = "Popularity of Allison Names over the Years") +
  theme(
    plot.title.position = "plot",
    axis.title.y = element_blank()
  )
`geom_smooth()` using formula 'y ~ x'
6. Regression Equation
Predicted Popularity = 209815.1 - 101.6(Year)
7. Residual Plot
# Residuals against fitted values for the Total ~ Year model.
ggplot(
  broom::augment(lm(Total ~ Year, data = allison_years)),
  aes(x = .fitted, y = .resid)
) +
  geom_point()
In the residual plot I notice two peaks where the popularity of the name “Allison” is more than expected.
8. Conclusion
Your name is still cool, but unfortunately it’s not as cool as it used to be.
9. Popularity of the Three Names
# Yearly totals for male babies named Allen, Alan, or Allan, drawn as one
# line per spelling.
a_names |>
  filter(Name %in% c("Allen", "Alan", "Allan"), Sex == "M") |>
  group_by(Year, Name) |>
  # One summed row per Year/Name pair; `.groups = "drop"` returns an
  # ungrouped tibble, so no grouped-output message is emitted.
  summarize(Total = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Total, color = Name)) +
  geom_line(size = 1.25) +
  labs(title = "Popularity of Names by Year") +
  theme(
    axis.title.y = element_blank(),
    plot.title.position = "plot"
  )
10. “Allan” Spelling in Year 2000
# Counts of each spelling among male babies in 2000 for CA and PA, reshaped
# to one row per state with one column per spelling.
allan_names <- a_names |>
  filter(
    Name %in% c("Allen", "Alan", "Allan"),
    Sex == "M",
    Year == 2000,
    State %in% c("PA", "CA")
  ) |>
  # Keep only the pivot inputs so State is the sole identifying column.
  select(Name, State, Count) |>
  pivot_wider(names_from = Name, values_from = Count)
# Table of the year-2000 spelling counts for CA and PA.
allan_names |>
  # Pin the column order so the positional labels below stay correct even if
  # the pivot produced the spelling columns in a different order.
  select(State, Alan, Allen, Allan) |>
  kable(
    col.names = c(
      "State",
      "Number of Alan Names",
      "Number of Allen Names",
      "Number of Allan Names"
    )
  )
| State | Number of Alan Names | Number of Allen Names | Number of Allan Names |
|---|---|---|---|
| CA | 579 | 176 | 131 |
| PA | 51 | 56 | 12 |
11. Converting to Percentages
# Each spelling's share of the three-spelling total within a state.
# The values are proportions (count / total); they are not multiplied by 100.
allan_names |>
  mutate(
    # Vectorised row-wise total: each row is handled on its own, so no
    # grouping is needed and the result is left ungrouped.
    total = Alan + Allen + Allan,
    Alan = Alan / total,
    Allen = Allen / total,
    Allan = Allan / total
  ) |>
  # Drop the helper column and fix the order expected by the labels below.
  select(State, Alan, Allen, Allan) |>
  kable(
    col.names = c(
      "State",
      "Percentage of Alan Names",
      "Percentage of Allen Names",
      "Percentage of Allan Names"
    )
  )
| State | Percentage of Alan Names | Percentage of Allen Names | Percentage of Allan Names |
|---|---|---|---|
| CA | 0.6534989 | 0.1986456 | 0.1478555 |
| PA | 0.4285714 | 0.4705882 | 0.1008403 |